# Package setup.
#
# The workspace is intentionally not wiped with rm(list = ls()): that call
# only removes global objects (attached packages and options are untouched),
# so it does not give a clean session. Run this script in a fresh R session
# instead.
#
# Every package is attached with library() so that a missing package stops
# the script immediately; require() would only return FALSE and let the
# script continue until some later call fails.
library(quanteda)
library(seededlda)
library(lubridate)
library(quanteda.textstats)
library(quanteda.textplots)
library(quanteda.textmodels)
library(stringi)
library(here)
library(stringr)
library(readxl)
library(tidyverse)

# Rmisc is only called through Rmisc::summarySE(), so it is never attached.
# Check for it up front rather than failing after the data have been loaded.
if (!requireNamespace("Rmisc", quietly = TRUE)) {
  stop("Package 'Rmisc' is required (used via Rmisc::summarySE()).",
       call. = FALSE)
}

# Tell `here` where this script sits relative to the project root, so the
# here() paths below are resolved against that root.
here::i_am("manuscript/LSQ/replication/dc_inbox_analysis.R")

# Input locations: the DCInbox newsletter export (csv) and the legislative
# effectiveness workbook (xlsx).
data_path <- here("manuscript/LSQ/replication/dcinbox_export.csv")
les_path <- here("manuscript/LSQ/replication/CELHouse93to116.xlsx")

# Read both inputs.
inbox <- read.csv(file = data_path)
les <- read_excel(path = les_path)

#### Prep the data ####

# Give the LES columns short names and drop everything else.
# The lookup maps new_name = "original workbook header"; selecting with a
# named vector renames and subsets in one step, keeping the order listed here.
les_columns <- c(
  icpsr       = "ICPSR number, according to Poole and Rosenthal",
  Congress    = "Congress number",
  State       = "Two-letter state code",
  District    = "Congressional district number",
  name        = "Legislator name, as given in THOMAS",
  les         = "Legislative Effectiveness Score (1-5-10)",
  BioGuide.ID = "Indicator for member in bioguide",
  dw_nom      = "First-dimension DW-NOMINATE score",
  black       = "1 = African American",
  latino      = "1 = Latino/a",
  woman       = "1 = female"
)

les <- les %>%
  select(all_of(les_columns))

# Attach the LES covariates to each newsletter row, then keep only rows that
# have a DW-NOMINATE score.
# - No `by` is supplied, so left_join() matches on every column name the two
#   tables share (dplyr prints the keys it chose).
#   NOTE(review): confirm those keys are the intended ones and consider
#   spelling them out with `by =`.
# - Comparing against the string "NA" yields NA for a missing score, and
#   filter() drops rows whose condition is NA; a literal "NA" string would be
#   dropped as well.
inbox <- inbox %>%
  left_join(les) %>%
  filter(dw_nom != "NA")

# Lower-case the newsletter text so the keyword searches below are
# effectively case-insensitive.
inbox$Body <- tolower(inbox$Body)

# ACA-related terms, matched as whole words/phrases.
# The alternatives must not carry spaces around `|`: in a regular expression
# those spaces are literal, so a pattern such as " obamacare " only matches
# when the word has a space on both sides and misses "obamacare.",
# "(aca)" or a term at the very start/end of the text. Word boundaries (\\b)
# keep short terms like "aca" from matching inside longer words while still
# allowing adjacent punctuation.
# NOTE(review): this flags more newsletters than the space-delimited pattern
# did, so counts that depend on `aca` will change.
aca_words <- "\\b(affordable care act|aca|obamacare|obama care|repeal and replace)\\b"

# 1/0 indicators (NA when Body is NA):
#   repeal - the text contains "repeal" anywhere (so also "repealed", etc.)
#   aca    - the text mentions the Affordable Care Act or a related term
inbox <- inbox %>%
  mutate(
    repeal = if_else(str_detect(Body, "repeal"), 1, 0),
    aca = if_else(str_detect(Body, aca_words), 1, 0)
  )

# Split each party's rows into three equally sized buckets by DW-NOMINATE
# score (1 = lowest scores, 3 = highest) and label the resulting
# party-by-ideology groups. Rows whose Party is neither "Democrat" nor
# "Republican" get party_ideo = NA.
#
# The grouping is removed right after the terciles are computed so that
# `inbox` does not stay grouped by Party for every later step.
#
# NOTE(review): ntile() ranks the rows of `inbox` (presumably one row per
# newsletter), not legislators, so prolific senders weigh more heavily on the
# cut points and rows with the same score can land on either side of a cut.
# Confirm this is the intended definition of the terciles.
inbox <- inbox %>%
  group_by(Party) %>%
  mutate(pty_tercile = ntile(dw_nom, 3)) %>%
  ungroup() %>%
  mutate(
    party_ideo = case_when(
      Party == "Democrat" & pty_tercile == 1 ~ "Lib Dem",
      Party == "Democrat" & pty_tercile == 2 ~ "Ctr Dem",
      Party == "Democrat" & pty_tercile == 3 ~ "Mod Dem",
      Party == "Republican" & pty_tercile == 1 ~ "Mod GOP",
      Party == "Republican" & pty_tercile == 2 ~ "Ctr GOP",
      Party == "Republican" & pty_tercile == 3 ~ "Con GOP"
    )
  )
         
# Display order of the party-ideology groups, most liberal to most
# conservative.
party_ideo_levels <- c("Lib Dem", "Ctr Dem", "Mod Dem",
                       "Mod GOP", "Ctr GOP", "Con GOP")

# Summarise the `repeal` indicator by party-ideology group.
#
# newsletters: data frame with columns `party_ideo` and `repeal`.
# levels:      group labels in the order the rows should be returned.
#
# Returns the Rmisc::summarySE() table (one row per group, including the `ci`
# column used for the error bars), without the row for missing `party_ideo`,
# sorted by `levels`.
summarise_repeal <- function(newsletters, levels = party_ideo_levels) {
  # Hand summarySE() a plain data frame holding only the two columns it
  # needs: this avoids relying on how it treats (grouped) tibbles and keeps
  # the large text column out of the per-group split.
  slim <- as.data.frame(newsletters)[, c("party_ideo", "repeal")]

  Rmisc::summarySE(slim, measurevar = "repeal", groupvars = "party_ideo") %>%
    # Test for missingness directly rather than comparing to the string "NA".
    filter(!is.na(party_ideo)) %>%
    arrange(factor(party_ideo, levels = levels))
}

# Rate of repeal mentions across all newsletters.
inbox_sum <- summarise_repeal(inbox)

# Rate of repeal mentions in newsletters that do *not* mention the ACA.
inbox_no_aca <- inbox %>%
  filter(aca == 0)
inbox_sum_no_aca <- summarise_repeal(inbox_no_aca)

#### Create Bar Charts with CIs ####

# Fill color for each party-ideology group.
party_colors <- c(
  "Con GOP" = "#b2182b", "Ctr GOP" = "#ef8a62", "Mod GOP" = "#fddbc7",
  "Mod Dem" = "#d1e5f0", "Ctr Dem" = "#67a9cf", "Lib Dem" = "#2166ac"
)

# Left-to-right order of the bars and the axis label shown for each group.
group_limits <- c("Lib Dem", "Ctr Dem", "Mod Dem",
                  "Mod GOP", "Ctr GOP", "Con GOP")
group_labels <- c("Liberal\nDemocrat", "Mainstream\nDemocrat",
                  "Moderate\nDemocrat", "Moderate\nRepublican",
                  "Mainstream\nRepublican", "Conservative\nRepublican")

# Layers shared by both charts: bars at the group rate, the manual fill
# palette, the ordered/labelled x axis, and error bars of +/- `ci`.
# Built by a function so each chart receives its own fresh layer objects.
repeal_layers <- function() {
  list(
    geom_bar(position = position_dodge(), stat = "identity"),
    scale_fill_manual(values = party_colors),
    scale_x_discrete(limits = group_limits, labels = group_labels),
    geom_errorbar(
      aes(ymin = repeal - ci, ymax = repeal + ci),
      width = .2, # Width of the error bars
      position = position_dodge(.9)
    )
  )
}

# Shared look: minimal theme, no legend (the x axis already names the groups).
repeal_theme <- function() {
  list(theme_minimal(), theme(legend.position = "none"))
}

# Bar chart with the entire corpus
bar <- ggplot(
  inbox_sum,
  aes(x = factor(party_ideo, levels = party_ideo), y = repeal, fill = party_ideo)
)
b1 <- bar + repeal_layers()
b2 <- b1 +
  repeal_theme() +
  labs(
    x = "Party-Ideological Group",
    y = "Rate of Word 'Repeal' in Newsletter",
    title = "Repeal Mentions in Congressional Official eNewsletters",
    subtitle = "Entire Corpus of eNewsletters: 2010-2021",
    caption = "Data Source: DCInbox"
  )
b2

# Bar chart for the corpus with ACA mentions excluded
bar2 <- ggplot(
  inbox_sum_no_aca,
  aes(x = factor(party_ideo, levels = party_ideo), y = repeal, fill = party_ideo)
)
b3 <- bar2 + repeal_layers()
b4 <- b3 +
  repeal_theme() +
  labs(
    x = "Party-Ideological Group",
    y = "Rate of Word 'Repeal' in Newsletter",
    title = "Repeal Mentions in Congressional Official eNewsletters",
    subtitle = "Omitting Newsletters with any mention of the ACA: 2010-2021",
    caption = "Data Source: DCInbox"
  )
b4
